installing packages

# install.packages("tidyverse")
# install.packages("gapminder")

loading packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(gapminder)
library(broom) #tidy() glance()

viewing data

# gapminder
# View(gapminder)

plot using ggplot - within tidyverse

ggplot(gapminder, aes(x = gdpPercap, y=lifeExp)) +
  geom_point()

# Highlight the UK observations against the rest of gapminder:
# flag each row with a logical indicator, then colour the scatter by it.
gapminder %>%
  mutate(indicator = (country == "United Kingdom")) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = indicator))

# we can see that the UK has been repeated for a bunch of different years

So, now that we know a country is repeated, we might want to visualise just the latest year for each country:

# piping is part of tidyverse
gapminder %>% nrow()
## [1] 1704
# filter is part of tidyverse
gapminder %>% 
  filter(country == "Afghanistan")
## # A tibble: 12 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## 11 Afghanistan Asia       2002    42.1 25268405      727.
## 12 Afghanistan Asia       2007    43.8 31889923      975.
# now we want to filter by year
gapminder %>%
  filter(year == 2007) %>%
  pull(lifeExp) %>%
  mean()
## [1] 67.00742

Let's combine plotting and filtering

gapminder %>%
  filter( year == 2007) %>%
  ggplot(aes(x=gdpPercap, y=lifeExp)) + 
  geom_point()

Moving on to regressions

gapminder %>%
  filter( year == 2007) %>%
  ggplot(aes(x=gdpPercap, y=lifeExp)) + 
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Linear Model

gapminder %>%
  filter( year == 2007) %>%
  ggplot(aes(x=gdpPercap, y=lifeExp)) + 
  geom_point() + 
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

Saving variables and using functions

# keep only the most recent year so each country appears exactly once
gm2007 <- gapminder %>%
  filter(year == 2007)

# simple linear regression of life expectancy on GDP per capita;
# printing the unassigned fit shows only the two estimated coefficients
lm(lifeExp ~ gdpPercap, data = gm2007)
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gm2007)
## 
## Coefficients:
## (Intercept)    gdpPercap  
##   5.957e+01    6.371e-04

We don’t want this output and instead we want to save this result and apply functions to it

model_lm <- lm(lifeExp ~ gdpPercap, data = gm2007)
summary(model_lm) # gives you more than just the estimated coefficients ie. intercept and slope
## 
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gm2007)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.828  -6.316   1.922   6.898  13.128 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.957e+01  1.010e+00   58.95   <2e-16 ***
## gdpPercap   6.371e-04  5.827e-05   10.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.899 on 140 degrees of freedom
## Multiple R-squared:  0.4606, Adjusted R-squared:  0.4567 
## F-statistic: 119.5 on 1 and 140 DF,  p-value: < 2.2e-16
residuals(model_lm) # residuals of predictors ie. difference between predicted and true values
##           1           2           3           4           5           6 
## -16.3585885  13.0746657   8.7702301 -19.8911299   7.6121709  -0.2705981 
##           7           8           9          10          11          12 
##  -2.7540718  -2.9147296   3.6099346  -1.5913589  -3.7559419   3.5531359 
##          13          14          15          16          17          18 
##  10.5420588 -16.8463317   7.0482188   6.6342522  -8.0460633 -10.2596628 
##          19          20          21          22          23          24 
##  -0.9345570 -10.4367387  -2.0528745 -15.2744773 -10.0003672  10.5952492 
##          25          26          27          28          29          30 
##  10.2357286   8.8592184   4.9580414 -13.2804878  -6.5580766  13.0711521 
##          31          32          33          34          35          36 
## -12.2218631   6.8679441  13.0062081   2.3724697  -3.7107349  -6.1014702 
##          37          38          39          40          41          42 
##   8.8303780  11.0491599   8.2163890   8.6626204 -15.7304355  -1.9342885 
##          43          44          45          46          47          48 
##  -7.0587859  -1.4100171   1.6778622 -11.2449522  -0.5972526  -0.6564938 
##          49          50          51          52          53          54 
##  -0.3895150   2.3716877   7.3891404  -4.1592473 -13.5466984   0.5847459 
##          55          56          57          58          59          60 
##   8.3715872  -2.6677900   2.2982367  -0.8606659   3.5699630   8.8284799 
##          61          62          63          64          65          66 
##   4.0039531  -2.8693162  -6.5967158   4.9175988   2.7776063   8.3369672 
##          67          68          69          70          71          72 
##   2.8681884  10.0898469  -6.3879361   6.7163535   4.1814531 -12.1185481 
##          73          74          75          76          77          78 
##   5.7622523 -17.9735247 -14.1517469   6.7041055  -0.7883088 -11.7464578 
##          79          80          81          82          83          84 
##   6.7419750  -5.7629144   3.4495006   6.2542769   8.9980281   5.2649277 
##          85          86          87          88          89          90 
##   9.0813768   9.1643859 -18.0084483   1.9018953  -9.7249409   3.5240074 
##          91          92          93          94          95          96 
##  -3.2488695   4.5921209  11.5816637  -3.0934674 -13.9898238 -10.8168007 
##          97          98          99         100         101         102 
##   1.8559417   4.2570118   9.7215829   9.5276921   7.1348833  10.0895856 
##         103         104         105         106         107         108 
##   6.1919036   5.4649532   6.8653696  11.9894530   6.0239012 -13.8735532 
##         109         110         111         112         113         114 
##   4.9439324  -0.5856827   2.4052755   8.2010146 -17.5472042  -9.6301791 
##         115         116         117         118         119         120 
##   3.1967583   1.9425134 -11.9967262 -16.1326655   3.0124664  10.3008666 
##         121         122         123         124         125         126 
##  -2.6677248 -22.8283427  -0.2548516  -1.7612700  11.9112315   0.5369554 
##         127         128         129         130         131         132 
##  -7.7542648   6.2983510  -1.7082204  -1.2204860   9.8382065   6.8222933 
##         133         134         135         136         137         138 
##  -8.6967059  -1.2955812  -8.6896144  10.0574246   6.9079504  13.1277383 
##         139         140         141         142 
##  11.9287963   1.6791936 -17.9915824 -16.3779179
predict(model_lm)
##        1        2        3        4        5        6        7        8 
## 60.18659 63.34833 63.53077 62.62213 67.70783 81.50560 82.58307 78.54973 
##        9       10       11       12       13       14       15       16 
## 60.45207 81.03236 60.48394 62.00086 64.30994 67.57433 65.34178 66.37075 
##       17       18       19       20       21       22       23       24 
## 60.34106 59.83966 60.65756 60.86674 82.70587 60.01548 60.65137 67.95775 
##       25       26       27       28       29       30       31       32 
## 62.72527 64.02978 60.19396 59.74249 61.88008 65.71085 60.54986 68.88006 
##       33       34       35       36       37       38       39       40 
## 65.26679 74.11353 82.04273 60.89247 63.40462 63.94484 63.12161 63.21538 
##       41       42       43       44       45       46       47       48 
## 67.30944 59.97429 60.00579 80.72302 78.97914 67.97995 60.04525 80.06249 
##       49       50       51       52       53       54       55       56 
## 60.41152 77.11131 62.86986 60.16625 59.93470 60.33125 61.82641 84.87579 
##       57       58       59       60       61       62       63       64 
## 71.03976 82.61767 61.12804 61.82152 66.96005 62.41432 85.48172 75.82740 
##       65       66       67       68       69       70       71       72 
## 77.76839 64.23003 79.73481 62.44515 60.49794 60.58065 74.44155 89.70655 
##       73       74       75       76       77       78       79       80 
## 66.23075 60.56552 59.82975 67.24789 60.23131 60.04946 67.49903 60.22991 
##       81       82       83       84       85       86       87       88 
## 60.71450 66.54672 67.19697 61.53807 65.46162 61.99961 60.09045 60.16710 
##       89       90       91       92       93       94       95       96 
## 62.63094 60.26099 83.01087 75.61188 61.31734 59.96047 60.84882 91.01280 
##       97       98       99      100      101      102      103      104 
## 73.78406 61.22599 65.81542 62.22431 64.28612 61.59841 69.37110 72.63305 
##      105      106      107      108      109      110      111      112 
## 71.88063 64.45255 66.45210 60.11555 60.58407 73.36268 60.65672 65.80099 
##      113      114      115      116      117      118      119      120 
## 60.11520 89.60218 71.46624 75.98349 60.15573 65.47167 77.92853 62.09513 
##      121      122      123      124      125      126      127      128 
## 61.22372 62.44134 81.13885 83.46227 62.23177 77.86304 60.27126 64.31765 
##      129      130      131      132      133      134      135      136 
## 60.12822 71.03949 64.08479 64.95471 60.23871 80.72058 86.93161 66.32658 
##      137      138      139      140      141      142 
## 66.83905 61.12126 61.49320 61.01881 60.37558 59.86492
tidy(model_lm)
## # A tibble: 2 × 5
##   term         estimate std.error statistic   p.value
##   <chr>           <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept) 59.6      1.01           59.0 9.89e-101
## 2 gdpPercap    0.000637 0.0000583      10.9 1.69e- 20
glance(model_lm) # model summary values and allows to compare several functions
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.461         0.457  8.90      120. 1.69e-20     1  -511. 1028. 1037.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
augment(model_lm) 
## # A tibble: 142 × 8
##    lifeExp gdpPercap .fitted  .resid    .hat .sigma   .cooksd .std.resid
##      <dbl>     <dbl>   <dbl>   <dbl>   <dbl>  <dbl>     <dbl>      <dbl>
##  1    43.8      975.    60.2 -16.4   0.0120    8.82 0.0207       -1.85  
##  2    76.4     5937.    63.3  13.1   0.00846   8.86 0.00928       1.48  
##  3    72.3     6223.    63.5   8.77  0.00832   8.90 0.00411       0.990 
##  4    42.7     4797.    62.6 -19.9   0.00907   8.77 0.0231       -2.25  
##  5    75.3    12779.    67.7   7.61  0.00709   8.91 0.00263       0.858 
##  6    81.2    34435.    81.5  -0.271 0.0292    8.93 0.0000144    -0.0309
##  7    79.8    36126.    82.6  -2.75  0.0327    8.93 0.00167      -0.315 
##  8    75.6    29796.    78.5  -2.91  0.0211    8.93 0.00118      -0.331 
##  9    64.1     1391.    60.5   3.61  0.0116    8.93 0.000975      0.408 
## 10    79.4    33693.    81.0  -1.59  0.0278    8.93 0.000471     -0.181 
## # … with 132 more rows
# gives us the original dataset plus predicted value from model, residuals, standard errors, etc
# ie. very useful for applying the results of the model
augment(model_lm) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

ADDING COMPLEXITY BY CHANGING CLASS OF REGRESSION FUNCTION - LOESS

Adding more complexity, not by adding more predictors, but by allowing a more flexible functional relationship (ie. not linear) using the LOESS function - Locally estimated Scatterplot Smoothing

model_loess <- loess(lifeExp ~ gdpPercap, data = gm2007)
summary(model_loess)
## Call:
## loess(formula = lifeExp ~ gdpPercap, data = gm2007)
## 
## Number of Observations: 142 
## Equivalent Number of Parameters: 4.76 
## Residual Standard Error: 7.119 
## Trace of smoother matrix: 5.2  (exact)
## 
## Control settings:
##   span     :  0.75 
##   degree   :  2 
##   family   :  gaussian
##   surface  :  interpolate      cell = 0.2
##   normalize:  TRUE
##  parametric:  FALSE
## drop.square:  FALSE
# tidy(model_loess)     LOESS function doesnt have tidy()
# glance(model_loess)   LOESS function doesnt have glance()
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

## SPAN - controls the degree of smoothing

LOESS looks at local data only. SPAN tells us how local the data should be; changing it makes the window of data points in focus narrower or wider.

# changing span to 0.3 makes a narrower focus and a less smooth curve
# the closer span is to 1, the smoother the curve gets

model_loess <- loess(lifeExp ~ gdpPercap, 
                     span = 0.3,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

DEGREE - degree of the local polynomial used in each local fit

# This is locally degree 1 - ie. locally linear
# again we increased span to make this curve smoother

model_loess <- loess(lifeExp ~ gdpPercap, 
                     span = 0.9,
                     degree = 1,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

ADDING COMPLEXITY - SQUARE OF PREDICTOR VARIABLE

# Quadratic fit of life expectancy on GDP per capita.
# poly(gdpPercap, 2) builds an orthogonal polynomial basis that already
# contains the linear term, so also adding gdpPercap made the model
# rank-deficient (lm reported an aliased NA coefficient). Dropping the
# redundant term leaves the fitted values - and hence this plot - unchanged.
model_lm2 <- lm(lifeExp ~ poly(gdpPercap, 2), data = gm2007)
augment(model_lm2) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

Now with LOESS model

# This is locally degree 2 (local quadratic fits)
# with span = 0.5, so each local fit uses half of the dataset

model_loess <- loess(lifeExp ~ gdpPercap, 
                     span = 0.5, # fit half of the dataset
                     degree = 2,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))

Both lm and loess used degree 2 polynomial but give very different results. This is because loess gives local flexibility vs lm being a global model.

ADDING COMPLEXITY - MORE PREDICTOR VARIABLES

Getting to know the data

library(fivethirtyeight) # has candy ranking dataset
## Some larger datasets need to be installed separately, like senators and
## house_district_forecast. To install these, we recommend you install the
## fivethirtyeightdata package by running:
## install.packages('fivethirtyeightdata', repos =
## 'https://fivethirtyeightdata.github.io/drat/', type = 'source')
?candy_rankings # info about this dataset
candy_rankings # view dataset
## # A tibble: 85 × 13
##    competitorname     chocolate fruity caramel peanutyalmondy nougat
##    <chr>              <lgl>     <lgl>  <lgl>   <lgl>          <lgl> 
##  1 100 Grand          TRUE      FALSE  TRUE    FALSE          FALSE 
##  2 3 Musketeers       TRUE      FALSE  FALSE   FALSE          TRUE  
##  3 One dime           FALSE     FALSE  FALSE   FALSE          FALSE 
##  4 One quarter        FALSE     FALSE  FALSE   FALSE          FALSE 
##  5 Air Heads          FALSE     TRUE   FALSE   FALSE          FALSE 
##  6 Almond Joy         TRUE      FALSE  FALSE   TRUE           FALSE 
##  7 Baby Ruth          TRUE      FALSE  TRUE    TRUE           TRUE  
##  8 Boston Baked Beans FALSE     FALSE  FALSE   TRUE           FALSE 
##  9 Candy Corn         FALSE     FALSE  FALSE   FALSE          FALSE 
## 10 Caramel Apple Pops FALSE     TRUE   TRUE    FALSE          FALSE 
## # … with 75 more rows, and 7 more variables: crispedricewafer <lgl>,
## #   hard <lgl>, bar <lgl>, pluribus <lgl>, sugarpercent <dbl>,
## #   pricepercent <dbl>, winpercent <dbl>
ggplot(candy_rankings, aes(x = pricepercent, y=winpercent)) + 
  geom_point()

Linear Model

model_lm <- lm(winpercent ~ pricepercent, data = candy_rankings)
tidy(model_lm)
## # A tibble: 2 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)      42.0      2.91     14.4  2.39e-24
## 2 pricepercent     17.8      5.30      3.35 1.21e- 3
augment(model_lm) %>%
  ggplot(aes(x = pricepercent, y = winpercent))+
  geom_point()+
  geom_line(aes(y=.fitted))

More Variables - categorical

model_lm <- lm(winpercent ~ pricepercent + chocolate, data = candy_rankings)
tidy(model_lm)
## # A tibble: 3 × 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)      41.6       2.40    17.3   4.35e-29
## 2 pricepercent      1.66      5.08     0.328 7.44e- 1
## 3 chocolateTRUE    18.3       2.91     6.29  1.46e- 8

Notice, when you add a categorical predictor to your linear model, you get the same simple regression line but with a different intercept for each level of that categorical variable. Here, you can see an intercept for the baseline level of the categorical variable and then a shift in intercept for the other levels relative to that baseline. Chocolate FALSE is the baseline group (intercept 41.6) and chocolate TRUE is the other group (intercept 41.6 + 18.3 ≈ 59.9).

augment(model_lm) %>%
  ggplot(aes(x = pricepercent, y = winpercent, 
             color = chocolate, 
             shape = chocolate,
             linetype = chocolate))+
  geom_point()+
  geom_line(aes(y=.fitted))

More variable - continuous

When we add continuous variables, we can no longer plot in 2D… Here, let’s try to plot in 3D

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
candy <- candy_rankings
candy3d <- plot_ly(data = candy_rankings,
                   x = ~pricepercent,
                   y = ~sugarpercent,
                   z = ~winpercent,
                   type = "scatter3d")

candy3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
# additive linear model with two continuous predictors
model_lm <- lm(winpercent ~ pricepercent + sugarpercent, data = candy_rankings)

# 101 x 101 grid of points covering the unit square [0, 1] x [0, 1]
xy_plane <- expand.grid(0:100, 0:100)/100

# rename expand.grid's default Var1/Var2 columns so augment() can match
# them to the model's predictor names
ps_plane <- xy_plane %>%
  rename(pricepercent = Var1,
         sugarpercent = Var2)

# predicted winpercent at every grid point, reshaped into the 101 x 101
# matrix that plotly's add_surface() expects
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)

candy3d %>%
  add_surface(
    x = ~(0:100)/100,
    y = ~(0:100)/100,
    z = ~lm_matrix
  )
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
tidy(model_lm)
## # A tibble: 3 × 5
##   term         estimate std.error statistic  p.value
##   <chr>           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)     39.8       3.44     11.6  6.33e-19
## 2 pricepercent    15.6       5.60      2.78 6.72e- 3
## 3 sugarpercent     6.73      5.66      1.19 2.38e- 1

More variables - both categorical and continuous

# 3D scatter of winpercent vs price and sugar, coloured and shaped by the
# chocolate indicator.
# Fix: the argument was misspelled as `made = "markers"` - plotly warned that
# 'scatter3d' objects have no 'made' attribute; the correct name is `mode`.
chocolate3d <- plot_ly(data = candy_rankings,
                       x = ~pricepercent,
                       y = ~sugarpercent,
                       z = ~winpercent,
                       color = ~chocolate,
                       colors = c("#2d708e","#d8576b"),
                       mode = "markers",
                       symbol = ~chocolate,
                       symbols = c("o","circle"),
                       type = "scatter3d",
                       showlegend = FALSE)

chocolate3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
candy <- candy_rankings
model_lm <- lm(winpercent ~ pricepercent + sugarpercent + chocolate, data = candy)

ps_plane <- ps_plane %>% 
  mutate(chocolate = TRUE)
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix_true <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)

ps_plane <- ps_plane %>%
  mutate(chocolate=FALSE)
lm_plane <- augment(model_lm, newdata=ps_plane)
lm_matrix_false <- matrix(lm_plane$.fitted, nrow = 101, ncol=101)

chocolate3d %>%
  add_surface(
    x = ~(0:100)/100,
    y = ~(0:100)/100,
    z = ~lm_matrix_true,
    showscale = FALSE,
    inherit = FALSE,
    colorscale = list(c(0,1),c("#f0f921","#7201a8"))) %>%
  add_surface(
    x = ~(0:100)/100,
    y = ~(0:100)/100,
    z = ~lm_matrix_false,
    showscale = FALSE,
    inherit = FALSE,
    colorscale = list(c(0,1), c("#3cbb75","#481567")))
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

## Warning: 'scatter3d' objects don't have these attributes: 'made'
## Valid attributes include:
## 'connectgaps', 'customdata', 'customdatasrc', 'error_x', 'error_y', 'error_z', 'hoverinfo', 'hoverinfosrc', 'hoverlabel', 'hovertemplate', 'hovertemplatesrc', 'hovertext', 'hovertextsrc', 'ids', 'idssrc', 'legendgroup', 'legendgrouptitle', 'legendrank', 'line', 'marker', 'meta', 'metasrc', 'mode', 'name', 'opacity', 'projection', 'scene', 'showlegend', 'stream', 'surfaceaxis', 'surfacecolor', 'text', 'textfont', 'textposition', 'textpositionsrc', 'textsrc', 'texttemplate', 'texttemplatesrc', 'transforms', 'type', 'uid', 'uirevision', 'visible', 'x', 'xcalendar', 'xhoverformat', 'xsrc', 'y', 'ycalendar', 'yhoverformat', 'ysrc', 'z', 'zcalendar', 'zhoverformat', 'zsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
tidy(model_lm)
## # A tibble: 4 × 5
##   term          estimate std.error statistic  p.value
##   <chr>            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)      38.6       2.81    13.8   6.50e-23
## 2 pricepercent     -1.66      5.27    -0.315 7.54e- 1
## 3 sugarpercent      9.04      4.63     1.95  5.42e- 2
## 4 chocolateTRUE    18.7       2.87     6.53  5.40e- 9

Another model that uses all of the predictor variables in the dataset

# Regress winpercent on every remaining column; the unique candy name is
# dropped first so it cannot act as a one-row-per-level "predictor".
model_lm_all <- candy_rankings %>%
  select(-competitorname) %>%
  lm(winpercent ~ ., data = .)
tidy(model_lm_all)
## # A tibble: 12 × 5
##    term                 estimate std.error statistic  p.value
##    <chr>                   <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)            34.5        4.32    7.99   1.44e-11
##  2 chocolateTRUE          19.7        3.90    5.07   2.96e- 6
##  3 fruityTRUE              9.42       3.76    2.50   1.45e- 2
##  4 caramelTRUE             2.22       3.66    0.608  5.45e- 1
##  5 peanutyalmondyTRUE     10.1        3.62    2.79   6.81e- 3
##  6 nougatTRUE              0.804      5.72    0.141  8.88e- 1
##  7 crispedricewaferTRUE    8.92       5.27    1.69   9.47e- 2
##  8 hardTRUE               -6.17       3.46   -1.78   7.85e- 2
##  9 barTRUE                 0.442      5.06    0.0872 9.31e- 1
## 10 pluribusTRUE           -0.854      3.04   -0.281  7.79e- 1
## 11 sugarpercent            9.09       4.66    1.95   5.50e- 2
## 12 pricepercent           -5.93       5.51   -1.08   2.86e- 1

Competitorname (ie. the candy name) is a unique identifier, and we want to avoid using it as a predictor — otherwise we wouldn’t really be grouping anything.

COMPARE DIFFERENT MODELS

rbind(glance(model_lm), glance(model_lm_all))
## # A tibble: 2 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.433         0.412  11.3     20.6  5.20e-10     3  -325.  659.  671.
## 2     0.540         0.471  10.7      7.80 9.50e- 9    11  -316.  657.  689.
## # … with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

We know that when we add more predictors, the R-squared should increase. Adjusted R-squared penalises R-squared using the ratio of predictors to sample size, so it can decrease when we add more predictors. Here, adjusted R-squared increased for model_lm_all which is an indicator that it was a model better at predicting the outcome.

---
title: "Week 1 Introduction"
author: "Kash"
date: "03/01/2022"
output: 
  html_document: 
    theme: readable
    highlight: pygments
    keep_md: yes
    code_download: true
    toc: true
    toc_float: true
---


# installing packages
```{r}
# install.packages("tidyverse")
# install.packages("gapminder")
```

# loading packages
```{r}
library(tidyverse)
library(gapminder)
library(broom) #tidy() glance()
```

# viewing data
```{r}
# gapminder
# View(gapminder)
```

# plot using ggplot - within tidyverse
```{r}
ggplot(gapminder, aes(x = gdpPercap, y=lifeExp)) +
  geom_point()
```

```{r}
# Highlight the UK observations against the rest of gapminder:
# flag each row with a logical indicator, then colour the scatter by it.
gapminder %>%
  mutate(indicator = (country == "United Kingdom")) %>%
  ggplot(aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(colour = indicator))

# we can see that the UK has been repeated for a bunch of different years
```

So, now that we know a country is repeated, we might want to visualise just the latest year for each country:
```{r}
# piping is part of tidyverse
gapminder %>% nrow()

# filter is part of tidyverse
gapminder %>% 
  filter(country == "Afghanistan")

# now we want to filter by year
gapminder %>%
  filter(year == 2007) %>%
  pull(lifeExp) %>%
  mean()
```

Let's combine plotting and filtering
```{r}
gapminder %>%
  filter( year == 2007) %>%
  ggplot(aes(x=gdpPercap, y=lifeExp)) + 
  geom_point()
```

# Moving on to regressions

```{r}
gapminder %>%
  filter( year == 2007) %>%
  ggplot(aes(x=gdpPercap, y=lifeExp)) + 
  geom_point() + 
  geom_smooth()
```
Linear Model
```{r}
gapminder %>%
  filter( year == 2007) %>%
  ggplot(aes(x=gdpPercap, y=lifeExp)) + 
  geom_point() + 
  geom_smooth(method = "lm")
```

Saving variables and using functions
```{r}
gm2007 <- gapminder %>%
  filter(year == 2007)

lm(lifeExp ~ gdpPercap, data = gm2007)
```

We don't want this output and instead we want to save this result and apply functions to it
```{r}
model_lm <- lm(lifeExp ~ gdpPercap, data = gm2007)
summary(model_lm) # gives you more than just the estimated coefficients ie. intercept and slope
residuals(model_lm) # residuals of predictors ie. difference between predicted and true values
predict(model_lm)
```

```{r}
tidy(model_lm)
```
```{r}
glance(model_lm) # model summary values and allows to compare several functions
```

```{r}
augment(model_lm) 
# gives us the original dataset plus predicted value from model, residuals, standard errors, etc
# ie. very useful for applying the results of the model
```

```{r}
augment(model_lm) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
```

# ADDING COMPLEXITY BY CHANGING CLASS OF REGRESSION FUNCTION - LOESS

Adding more complexity, not by adding more predictors, but by allowing a more flexible functional relationship (ie. not linear) using the LOESS function - Locally estimated Scatterplot Smoothing
```{r}
model_loess <- loess(lifeExp ~ gdpPercap, data = gm2007)
summary(model_loess)
# tidy(model_loess)     LOESS function doesnt have tidy()
# glance(model_loess)   LOESS function doesnt have glance()
```

```{r}
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
```

## SPAN - controls the degree of smoothing

LOESS looks at local data only. SPAN tells us how local the data should be; changing it makes the window of data points in focus narrower or wider.

```{r}
# changing span to 0.3 makes a narrower focus and a less smooth curve
# the closer span is to 1, the smoother the curve gets

model_loess <- loess(lifeExp ~ gdpPercap, 
                     span = 0.3,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
```

## DEGREE - degree of the local polynomial used in each local fit

```{r}
# This is locally degree 1 - ie. locally linear
# again we increased span to make this curve smoother

model_loess <- loess(lifeExp ~ gdpPercap, 
                     span = 0.9,
                     degree = 1,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
```

# ADDING COMPLEXITY - SQUARE OF PREDICTOR VARIABLE

```{r}
# Quadratic fit of life expectancy on GDP per capita.
# poly(gdpPercap, 2) builds an orthogonal polynomial basis that already
# contains the linear term, so also adding gdpPercap made the model
# rank-deficient (lm reported an aliased NA coefficient). Dropping the
# redundant term leaves the fitted values - and hence this plot - unchanged.
model_lm2 <- lm(lifeExp ~ poly(gdpPercap, 2), data = gm2007)
augment(model_lm2) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
```

Now with LOESS model

```{r}
# This is locally degree 2 (local quadratic fits)
# with span = 0.5, so each local fit uses half of the dataset

model_loess <- loess(lifeExp ~ gdpPercap, 
                     span = 0.5, # fit half of the dataset
                     degree = 2,
                     data = gm2007)
augment(model_loess) %>%
  ggplot(aes(gdpPercap, lifeExp)) + 
  geom_point() +
  geom_line(aes(gdpPercap, .fitted))
```

Both lm and loess used degree 2 polynomial but give very different results. This is because loess gives local flexibility vs lm being a global model.

# ADDING COMPLEXITY - MORE PREDICTOR VARIABLES

Getting to know the data

```{r}
library(fivethirtyeight) # provides the candy_rankings dataset
?candy_rankings # open the help page describing the dataset's columns
candy_rankings # print the dataset
```

```{r}
# Scatterplot: price percentile vs win percentage.
candy_rankings %>%
  ggplot(aes(x = pricepercent, y = winpercent)) +
  geom_point()
```

Linear Model

```{r}
# Simple linear regression of win percentage on price percentile.
model_lm <- lm(winpercent ~ pricepercent, data = candy_rankings)
model_lm %>% tidy()
```

```{r}
# Overlay the fitted regression line on the observed points.
model_lm %>%
  augment() %>%
  ggplot(aes(x = pricepercent, y = winpercent)) +
  geom_point() +
  geom_line(aes(y = .fitted))
```

More Variables - categorical

```{r}
# Add a categorical predictor (chocolate: TRUE / FALSE).
model_lm <- lm(winpercent ~ pricepercent + chocolate,
               data = candy_rankings)
model_lm %>% tidy()
```

Notice, when you add categorical predictor to your linear model, you get the same simple regression line but you have different intercepts for the different levels of that categorical variable.
You can see an intercept for the baseline level of the categorical variable and then a shift in intercept for the other levels relative to that baseline. Here, chocolate = FALSE is the baseline group (intercept ≈ 41.57) and chocolate = TRUE is the other group (intercept ≈ 41.57 + 18 ≈ 59.6).

```{r}
# One fitted line per chocolate group: identical slope, shifted intercepts.
model_lm %>%
  augment() %>%
  ggplot(aes(
    x = pricepercent,
    y = winpercent,
    color = chocolate,
    shape = chocolate,
    linetype = chocolate
  )) +
  geom_point() +
  geom_line(aes(y = .fitted))
```

More variable - continuous

```{r}

```

When we add more continuous variables, we can no longer plot in 2D... Here, let's try to plot in 3D.
```{r}
# Interactive 3D scatterplot: price, sugar, and win percentage.
# (The previous `candy <- candy_rankings` copy was unused here and is
# recreated later before it is needed, so it has been removed.)
library(plotly)

candy3d <- plot_ly(
  data = candy_rankings,
  x = ~pricepercent,
  y = ~sugarpercent,
  z = ~winpercent,
  type = "scatter3d",
  mode = "markers" # be explicit; otherwise plotly warns and guesses the mode
)

candy3d
```

```{r}
# Fit a plane (two continuous predictors) and overlay it on the 3D scatter.
model_lm <- lm(winpercent ~ pricepercent + sugarpercent, data = candy_rankings)

# 101 x 101 grid covering [0, 1] x [0, 1] for both predictors.
xy_plane <- expand.grid(0:100, 0:100)/100

# Rename the grid columns to match the model's predictor names so
# augment(newdata = ...) can find them.
ps_plane <- xy_plane %>%
  rename(pricepercent = Var1,
         sugarpercent = Var2)

# Predict on the grid, then reshape fitted values into the matrix form
# add_surface() expects.
# NOTE(review): matrix() fills column-wise while expand.grid() varies
# Var1 (pricepercent) fastest -- confirm this orientation matches
# plotly's surface convention (z columns ~ x, z rows ~ y).
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)

candy3d %>%
  add_surface(
    x = ~(0:100)/100,
    y = ~(0:100)/100,
    z = ~lm_matrix
  )
```

```{r}
# Coefficient table for the two-predictor model.
tidy(model_lm)
```

More variables - both categorical and continuous

```{r}
# 3D scatter coloured and shaped by the chocolate indicator.
chocolate3d <- plot_ly(
  data = candy_rankings,
  x = ~pricepercent,
  y = ~sugarpercent,
  z = ~winpercent,
  color = ~chocolate,
  colors = c("#2d708e", "#d8576b"),
  # Fixed typo: was `made = "markers"`, which plotly silently ignored
  # as an unknown argument.
  mode = "markers",
  symbol = ~chocolate,
  symbols = c("o", "circle"),
  type = "scatter3d",
  showlegend = FALSE
)

chocolate3d
```

```{r}
# Model with two continuous predictors plus one categorical predictor,
# visualised as one regression plane per chocolate level.
candy <- candy_rankings
model_lm <- lm(winpercent ~ pricepercent + sugarpercent + chocolate, data = candy)

# Plane for chocolate == TRUE: predict over the price/sugar grid.
ps_plane <- ps_plane %>% 
  mutate(chocolate = TRUE)
lm_plane <- augment(model_lm, newdata = ps_plane)
lm_matrix_true <- matrix(lm_plane$.fitted, nrow = 101, ncol = 101)

# Plane for chocolate == FALSE: same slopes, shifted intercept.
ps_plane <- ps_plane %>%
  mutate(chocolate=FALSE)
lm_plane <- augment(model_lm, newdata=ps_plane)
lm_matrix_false <- matrix(lm_plane$.fitted, nrow = 101, ncol=101)

# Overlay both planes on the chocolate-coloured 3D scatter.
# inherit = FALSE stops the surfaces from inheriting the scatter's
# colour/symbol mappings; showscale = FALSE hides the colourbars.
chocolate3d %>%
  add_surface(
    x = ~(0:100)/100,
    y = ~(0:100)/100,
    z = ~lm_matrix_true,
    showscale = FALSE,
    inherit = FALSE,
    colorscale = list(c(0,1),c("#f0f921","#7201a8"))) %>%
  add_surface(
    x = ~(0:100)/100,
    y = ~(0:100)/100,
    z = ~lm_matrix_false,
    showscale = FALSE,
    inherit = FALSE,
    colorscale = list(c(0,1), c("#3cbb75","#481567")))
```

```{r}
# Coefficient table for the three-predictor model.
tidy(model_lm)
```

# Another model that uses all of the predictor variables in the dataset

```{r}
# Regress winpercent on every remaining column at once; the unique
# identifier column is dropped first.
model_lm_all <- candy_rankings %>%
  select(-competitorname) %>%
  lm(winpercent ~ ., data = .)
tidy(model_lm_all)
```

`competitorname` (i.e. the candy name) is a unique identifier, and we want to avoid using it as a predictor — otherwise we won't really be grouping anything.

# COMPARE DIFFERENT MODELS

```{r}
# Stack the one-row model summaries for side-by-side comparison.
simple_fit <- glance(model_lm)
full_fit <- glance(model_lm_all)
rbind(simple_fit, full_fit)
```

We know that when we add more predictors, the R-squared should increase.
Adjusted R-squared penalises R-squared using the ratio of predictors to sample size, so it can decrease when we add more predictors. Here, adjusted R-squared increased for model_lm_all which is an indicator that it was a model better at predicting the outcome.








